/*

01_make_data.do
(this do file also calls the do files: 01a_app_use, 01b_sim_use,
	01c_email_survey)

Purpose: create student-level panel that incorporates all coviarates and outcomes
	from the raw data sets
Inputs: smartchoice_2019, smartchoice_2020, ratex-table-2019, ratex-table-2020,
	email_logs, logins, parents, randomizations, students, smartchoice_all, races,
	cities, students_geocodes, choicesets-student-panel, email_survey,
	sim-use-student-level, app-use-student-level
Outputs: placements, ratex2019, ratex2020, emails, logins, randomizations, 
	students, students_update, student-level-panel
	
*/

/// Placements
clear
foreach year in 2019 2020 {
	preserve
		insheet using "$preprocessed_smartchoice/smartchoice_`year'.csv", clear

		keep studentid rank  placed_dayof_bestoffer ///
			placed_any_bestoffer default_school_indicator academyid resident year

		ren *_bestoffer *
		ren placed_any placed_ever
		
		bys studentid: egen max_rank = max(rank)
		bys studentid: egen placed_any = max(placed_dayof)
		
		* Fix dayof placement to nbhd schools for resident HS applicants
		replace placed_dayof = 1 if placed_any == 0 & default_school_indicator == 1
		drop placed_any default_school_indicator max_rank
		
		reshape wide placed_dayof placed_ever academyid , i(studentid) j(rank)
		
		egen placed_any = rowmax(placed_dayof?)
		egen placed_ever_any = rowmax(placed_ever?)
		gen placed_other = placed_any == 1 & placed_dayof1 == 0 
		gen unplaced = placed_any == 0 
		
		tempfile placements
		save `placements', replace
	restore
	append using `placements'
}

save "$int/placements", replace

/// Ratex from 2020 and 2019
foreach year in 2019 2020 {
	insheet using "$sim/ratex-table-`year'.csv", clear
	ren ratex ratex_real
	ren priority priority_real
	
	save "$int/ratex`year'", replace
}

/// Emails sent
insheet using "$email_logs", clear

split ts_sent, parse(".")
drop ts_sent
ren ts_sent1 ts_sent 
drop ts_sent?

gen double email_ts = clock(ts_sent,"YMD hms")
format email_ts %tc

gen email_received = bounced == 0 & status == "sent" 
gen year = 2020

save "$int/emails", replace

//// Logins 
insheet using "$logins", names clear

split ts_login, parse(".")
gen double login_ts = clock(ts_login1, "YMD hms")
	
format login_ts %tc
drop ts*

* Merge on parentids
preserve
	insheet using "$smartchoice_final2020/parents.csv", clear
	
	keep email parentid 
	
	replace email = trim(upper(email))
	tempfile parentids
	save `parentids', replace
restore

replace email = trim(upper(email))
joinby email using `parentids', unmatched(both)
tab _merge
keep if _merge==3

assert _merge == 3 // good
drop _merge 
ren email login_email

bys parentid (login_ts): gen time_diff = seconds(login_ts[_n] - login_ts[_n-1])

* A login within same parent account with 15 second of a previous one is not counted
drop if time_diff < 15

* Create login attempt running variable
bys parentid (login_ts): egen login = rank(login_ts), track
bys parentid: egen logins = max(login)

drop time_diff 

* Bring to parent level
reshape wide login_ts , i(parentid) j(login)

gen year = 2020
save "$int/logins", replace

//// Randomizations
insheet using "$randomizations", clear

gen double rand_ts = clock(rand_date, "YMD hms")
format rand_ts %tc
sort rand_ts
split rand_date,p(" ")
drop rand_date rand_date2
ren rand_date1 rand_date

egen block = group(rand_ts)

gen year = 2020

save "$int/randomizations", replace

////////////////// Create studentlevel dataset ///////////////////////////

insheet using "$smartchoice_final2020/students.csv", clear
gen year = 2020
preserve
	insheet using "$smartchoice_final2019/students.csv", clear
	gen year = 2019
	tempfile data2019
	save `data2019', replace
restore

append using `data2019', force

keep studentid parentid student_district_id fname lname currentschool ///
	zone gender age current_grade grade race address last_login ///
	enroll_status active city sibling guardian1email ///
	ethnicity year dob address address_lat address_lng zip

* Subset on applications that appear in the file that is run in the lottery
* i.e. no admin-added records that did not apply, or admin-deleted applications
preserve
	insheet using "$preprocessed_smartchoice/smartchoice_all.csv", clear
	keep if year >= 2019		
	keep studentid year 
	duplicates drop
	tempfile final
	save `final', replace
restore
merge 1:1 studentid year using `final', keep(3) nogen 

* Make grade variable numeric
foreach var in grade current_grade {
	ren `var' `var'_str
	destring `var'_str, gen(`var') force
	replace `var' = -2 if `var'_str == "PreK3"
	replace `var' = -1 if `var'_str == "PreK4"
	replace `var' = 0 if `var'_str == "K"
	drop *_str
}

isid studentid year	

* Dayof placements and resident status
merge 1:1 studentid year using "$int/placements", update replace gen(_mplace)
drop if _mplace == 2

/////////////////////// Pre-Treatment variables //////////////////////////////
gen prek4 = grade == -2
gen prek3 = grade == -1
gen k = grade == 0
gen hs = grade == 9

* Race and ethnicity 
preserve
	insheet using "$smartchoice_final2020/races.csv" , clear 
	keep raceid race_name
	tempfile race
	save `race' , replace
restore

* clean old race var, a str which looks like "Black|White|Asian"
split race , gen(raceid) parse("|")
replace race = ""
forv i = 1/3 {
ren raceid`i' raceid
destring raceid, replace

merge m:1 raceid using `race' , keep(1 3) nogen

replace race = race + " " + race_name
drop raceid race_name
}
replace race = trim(race)

gen hispanic = ethnicity == "Hispanic" if !mi(ethnicity)

gen black = strpos(race,"Black") > 0 & hispanic == 0 if !mi(race)
gen white = strpos(race,"White") > 0 & black == 0 & hispanic ==0 if !mi(race)

drop raceid*
* Update resident status for students in in smartchoice files
preserve
	insheet using "$smartchoice_final2020/cities.csv" , clear 
	keep cityid city_name
	tempfile cities
	save `cities' , replace
restore

destring city, gen(cityid) force
merge m:1 cityid using `cities', keep(1 3) nogen

replace city_name = city if mi(cityid)
drop city cityid
ren city_name city

replace resident = city == "New Haven" if mi(resident)
replace city = "New Haven" if resident == 1
replace city = lower(city)

* Repeat grade
gen repeat_grade = ///
	current_grade == grade if !mi(current_grade) & !mi(grade)

/// Students in preK or grade 9
preserve
keep if grade <= 0 | grade == 9 
save "$int/students", replace
restore

/// Merge lats and lons
merge 1:1 studentid year using "$geocodes/students_geocodes", ///
	keepusing(lat lon) keep(1 3) nogen

/// Students in all grades + lattitude and longitude info
save "$int/students_update", replace

////////////////////////// Treatment variables /////////////////////////////

/// Merge on email sent, login behavior, app choice behavior and rando and ratex info
merge 1:1 studentid year using "$int/emails", gen(_memail) keepusing(email_received email_ts) 
merge m:1 parentid year using "$int/logins", gen(_mlogin) keep(1 3) // avoid duplicates
merge 1:1 studentid year using "$int/randomizations", gen(_mrand)

// Drop records that are neither in smartchoice dayof files, nor any of the treatment files
drop if _mplace == 1 & _mrand == 1 & _memail == 1 & _mlogin == 1

replace email_received = 0 if mi(email_received)
replace logins = 0 if mi(logins)
gen warnings_email = email_received & treat == 3

/// App use
preserve
	do 01a_app_use.do
restore

/// Sim use 
preserve
	do 01b_sim_use.do
restore 

/// Email survey
preserve
	do 01c_email_survey.do
restore

/////////////////////////////// Outcomes ///////////////////////////////////
// Email survey
merge m:1 parentid year using "$int/email_survey", keep(1 3) gen(_memailsurv) ///
	keepusing(submit_full_app satisfaction* feature* MP1 prefer* strategic*)

// Simulater Use
merge m:1 studentid year using "$int/sim-use-student-level", ///
	gen(_msim) keepusing(N_* sim_* )
	
// Application changes
merge 1:m studentid year using "$int/app-use-student-level", ///
	keep(1 2 3) gen(_mapp) ///
	keepusing(lengthen_app* shorten_app* risk_* change_school* new* modify* data any_baseline)

tab _mapp year
if _mapp == 2 assert year == 2019

* Fix app_risk for 2019 applicants
replace app_risk = risk_final_initial if data == "changelog" & year == 2019

replace data = "None" if mi(data)
isid studentid year

* Set sim variables to 0 for applicants who do not use the simulator
foreach var of varlist N_* sim_* {
	replace `var' = 0 if mi(`var') & _msim == 1
}

* Fix initial simulated schools 
replace N_schools_initial = 0 if _msim == 1

* Identify the Simulator and RCT sample
gen rct = treat == 0 | treat == 1 

* Identify simulator-eligible markets
gen simulator_eligible = grade <= 0 | grade == 9 

* Identify Simulator-eligible applications <=> apply before randomization ends
gen simulator_app_eligible = simulator_eligible & _mapp == 3 // We find you in the app-use data
gen simulator = !mi(treat) 

** Very few students are in the app-use data but are from a non-rct-eligible grade
** They either applied to the wrong grade at the time of randomizaton (rarely happens) OR
** List at least one school that didn't exist in 2019 which makes it impossible to assess their portfolio risk

* Set app use outcomes to missing for applicants that have not been randomized in 2020
foreach var of varlist lengthen_app* shorten_app* risk*diff change_school* new* modify* {
	replace `var' = . if simulator == 0 & year == 2020
}
replace data = "None" if simulator_eligible == 0 

* Simulator outcomes
gen any_login = logins > 0 
gen login_count = logins 
gen any_sim = _msim == 3

foreach def in "_ever" "_final" {
	gen login_lengthen`def' = any_login == 1 & lengthen_app`def' == 1 if _mapp == 3
	gen login_change_school`def' = any_login == 1 & change_school`def' == 1 if _mapp == 3
	gen login_modify`def' = any_login == 1 & modify`def' == 1 if _mapp == 3
}

gen female = gender == "Female" if !mi(gender)

* Risk deciles
cap drop riskq
egen riskq = cut(app_risk), at(0(.1)1)
replace riskq = .9 if app_risk == 1 & mi(riskq)

* Variable labels
label var female "Female"
label var login_count "Number of Logins"
label var email_received "Received email"
label var warnings_email "Warnings email"
label var any_login "Pr(Any login)"
label var white "White"
label var black "African American"
label var hispanic "Hispanic"
label var resident "NH Resident"
label var grade "Grade"
label var repeat_grade "Repeat grade"
label var prek4 "PreK4"
label var prek3 "PreK3"
label var k "K"
label var hs "Highschool"
label var app_risk "Risk score"
label var risk_real_initial "Initial realized risk"
label var risk_real_final "Final realized risk"
label var risk_sim_initial "Initial simulated risk"
label var risk_sim_final "Final simulated risk"
label var risk_final_initial "Initial simulated risk using final priorities"
label var risk_final_final "Final simulated risk using final priorities"
label var rct "RCT sample"
label var simulator "Simulator sample"
label var simulator_eligible "Simulator eligible grade"
label var simulator_app_eligible "Apply in time"
label var data "Data source of app use data"
foreach def in "_ever" "_final" {
	label var login_lengthen`def' "Access sim. + lengthen app"
	label var login_change_school`def' "Access sim. + change school"
	label var login_modify`def' "Access sim. + change app."
}

label var any_sim "Pr(Any sim. run)"
label var N_sims "Simulations"
label var N_schools_all "Simulated schools (all)"
label var N_schools_initial "Simulated schools (in initial app)"
label var N_schools_not_initial "Simulated schools (not in initial app.)"

label var sim_not_initial_app "Any extra simulation"
label var sim_shorter_app "Shorten app"
label var sim_longer_app "Lengthen app"
label var sim_diff_schools  "Change order/schools"
label var sim_diff_any "Change length, order or schools"
label var sim_lower_risk "Lower risk"
label var sim_risk_diff "Max. risk reduction"
label var riskq "Simulated risk decile"
label var placed_any "Any placement"
label var placed_other "Placed other"
label var placed_dayof1 "Placed $1^{st}$"
label var unplaced "Unplaced"

save "$int/student-level-panel.dta", replace